This report analyzes kinship likelihood ratio (LR) simulations comparing two tested hypotheses:
For each hypothesis, we examine performance across:
# Generate toy simulation data for testing the analysis pipeline
set.seed(42)
# Define factor levels
population_order <- c("all", "AfAm", "Asian", "Cauc", "Hispanic")
relationship_order <- c("parent_child", "full_siblings", "half_siblings",
"cousins", "second_cousins", "unrelated")
loci_set_order <- c("core_13", "identifiler_15", "expanded_20",
"supplementary", "autosomal_29")
# ONLY these two hypotheses are tested
tested_hypotheses <- c("parent_child", "full_siblings")
# Number of loci in each set
loci_counts <- c(
"core_13" = 13,
"identifiler_15" = 15,
"expanded_20" = 20,
"supplementary" = 23,
"autosomal_29" = 29
)
# Mean per-locus log10(LR) matrix for the TWO tested hypotheses only
mean_log10_lr_matrix <- list(
# Testing PARENT-CHILD hypothesis
parent_child = c(
"parent_child" = 0.40, # True positive
"full_siblings" = 0.10, # Related FP (siblings look somewhat like PC)
"half_siblings" = -0.05, # Related FP
"cousins" = -0.25, # Related FP
"second_cousins" = -0.30, # Related FP
"unrelated" = -0.35 # Unrelated FP
),
# Testing FULL-SIBLING hypothesis
full_siblings = c(
"parent_child" = 0.35, # Related FP (PC looks like FS)
"full_siblings" = 0.25, # True positive
"half_siblings" = 0.063, # Related FP - POSITIVE! (key finding)
"cousins" = -0.232, # Related FP
"second_cousins" = -0.283, # Related FP
"unrelated" = -0.340 # Unrelated FP
)
)
# SD of per-locus log10(LR)
sd_log10_lr_per_locus <- 0.20
# Population effects (slight variation by ancestry)
pop_effects <- c(
"all" = 0,
"AfAm" = 0.015,
"Asian" = -0.012,
"Cauc" = 0.008,
"Hispanic" = -0.008
)
n_pairs <- params$n_pairs_per_group
# Generate data - ONLY for parent_child and full_siblings hypotheses
generate_toy_data <- function() {
all_data <- list()
for (pop in population_order) {
for (known_rel in relationship_order) {
for (tested_rel in tested_hypotheses) { # Only PC and FS hypotheses
for (loci_set in loci_set_order) {
n_loci <- loci_counts[loci_set]
# Get mean per-locus LR for this known/tested combination
mean_per_locus <- mean_log10_lr_matrix[[tested_rel]][known_rel]
sd_per_locus <- sd_log10_lr_per_locus
# Add population effect
pop_effect <- pop_effects[pop]
# Combined LR is sum of per-locus log10(LR)
mean_combined <- n_loci * (mean_per_locus + pop_effect)
sd_combined <- sqrt(n_loci) * sd_per_locus
log10_lr <- rnorm(n_pairs, mean = mean_combined, sd = sd_combined)
combined_lr <- 10^log10_lr
group_data <- data.table(
batch_id = 1,
pair_id = 1:n_pairs,
population = pop,
known_relationship = known_rel,
tested_relationship = tested_rel,
tested_population = pop,
loci_set = loci_set,
combined_LR = combined_lr,
is_correct_pop = TRUE
)
all_data[[length(all_data) + 1]] <- group_data
}
}
}
}
rbindlist(all_data)
}
if (params$use_toy_data) {
cat("Generating toy simulation data...\n")
all_combined <- generate_toy_data()
cat("Generated", nrow(all_combined), "rows of toy data\n")
} else {
cat("Loading data from:", params$input_file, "\n")
all_combined <- readRDS(params$input_file)
setDT(all_combined)
}## Generating toy simulation data...
## Generated 150000 rows of toy data
## Total rows: 150000
## Unique pairs per group: 500
## Tested hypotheses: parent_child, full_siblings
relationship_labels <- c(
"parent_child" = "Parent-Child",
"full_siblings" = "Full Siblings",
"half_siblings" = "Half Siblings",
"cousins" = "Cousins",
"second_cousins" = "Second Cousins",
"unrelated" = "Unrelated"
)
loci_set_labels <- c(
"core_13" = "Core 13",
"identifiler_15" = "Identifiler 15",
"expanded_20" = "Expanded 20",
"supplementary" = "Supplementary 23",
"autosomal_29" = "Autosomal 29"
)
population_labels <- c(
"all" = "All Populations",
"AfAm" = "African American",
"Asian" = "Asian",
"Cauc" = "Caucasian",
"Hispanic" = "Hispanic"
)
all_combined[, `:=`(
population = factor(population, levels = population_order),
tested_population = factor(tested_population, levels = population_order),
known_relationship = factor(known_relationship, levels = relationship_order),
tested_relationship = factor(tested_relationship, levels = tested_hypotheses),
loci_set = factor(loci_set, levels = loci_set_order)
)]
# Add log10_LR column
all_combined[, log10_LR := log10(pmax(combined_LR, 1e-20))]
population_colors <- c(
"all" = "#FF7F00",
"AfAm" = "#E41A1C",
"Asian" = "#377EB8",
"Cauc" = "#4DAF4A",
"Hispanic" = "#984EA3"
)
relationship_colors <- c(
"parent_child" = "#1b9e77",
"full_siblings" = "#d95f02",
"half_siblings" = "#7570b3",
"cousins" = "#e7298a",
"second_cousins" = "#66a61e",
"unrelated" = "#666666"
)# Show counts by population and known relationship
data_summary <- all_combined[, .(
n_pairs = uniqueN(paste(batch_id, pair_id)),
n_tested_hypotheses = uniqueN(tested_relationship),
n_loci_sets = uniqueN(loci_set)
), by = .(population, known_relationship)]
data_summary_wide <- dcast(data_summary, known_relationship ~ population,
value.var = "n_pairs", fill = 0)
kable(data_summary_wide,
caption = "Number of simulated pairs by population and known relationship",
format.args = list(big.mark = ",")) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))| known_relationship | all | AfAm | Asian | Cauc | Hispanic |
|---|---|---|---|---|---|
| parent_child | 500 | 500 | 500 | 500 | 500 |
| full_siblings | 500 | 500 | 500 | 500 | 500 |
| half_siblings | 500 | 500 | 500 | 500 | 500 |
| cousins | 500 | 500 | 500 | 500 | 500 |
| second_cousins | 500 | 500 | 500 | 500 | 500 |
| unrelated | 500 | 500 | 500 | 500 | 500 |
cat("\nEach cell shows", params$n_pairs_per_group, "pairs tested under",
length(tested_hypotheses), "hypotheses (Parent-Child, Full-Siblings) across",
length(loci_set_order), "loci sets.\n")##
## Each cell shows 500 pairs tested under 2 hypotheses (Parent-Child, Full-Siblings) across 5 loci sets.
Testing: Are these pairs parent-child vs unrelated?
plot_lr_by_hypothesis <- function(data, tested_hyp, pop_filter = NULL) {
subset_data <- data[tested_relationship == tested_hyp]
if (!is.null(pop_filter)) {
subset_data <- subset_data[population == pop_filter]
}
title_pop <- if (is.null(pop_filter)) "All Populations" else population_labels[pop_filter]
ggplot(subset_data, aes(x = loci_set, y = log10_LR, fill = known_relationship)) +
geom_boxplot(outlier.size = 0.3, outlier.alpha = 0.2) +
facet_wrap(~known_relationship, scales = "free_y", ncol = 3,
labeller = labeller(known_relationship = relationship_labels)) +
scale_fill_manual(values = relationship_colors, guide = "none") +
scale_x_discrete(labels = loci_set_labels) +
geom_hline(yintercept = 0, linetype = "dashed", color = "red", alpha = 0.7) +
labs(
title = paste("log10(Combined LR) - Testing", relationship_labels[tested_hyp], "Hypothesis"),
subtitle = paste(title_pop, "| Red line = LR=1 (no evidence either way)"),
x = "Loci Set",
y = "log10(Combined LR)"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8))
}
plot_lr_by_hypothesis(all_combined, "parent_child", "all")subset_pc <- all_combined[tested_relationship == "parent_child"]
ggplot(subset_pc, aes(x = loci_set, y = log10_LR, fill = known_relationship)) +
geom_boxplot(outlier.size = 0.3, outlier.alpha = 0.2) +
facet_grid(population ~ known_relationship, scales = "free_y",
labeller = labeller(population = population_labels,
known_relationship = relationship_labels)) +
scale_fill_manual(values = relationship_colors, guide = "none") +
scale_x_discrete(labels = loci_set_labels) +
geom_hline(yintercept = 0, linetype = "dashed", color = "red", alpha = 0.5) +
labs(
title = "log10(Combined LR) - Testing Parent-Child Hypothesis",
subtitle = "Rows = Population, Columns = True Relationship. Red line = LR=1",
x = "Loci Set",
y = "log10(Combined LR)"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 7),
strip.text = element_text(size = 8))Testing: Are these pairs full siblings vs unrelated?
subset_fs <- all_combined[tested_relationship == "full_siblings"]
ggplot(subset_fs, aes(x = loci_set, y = log10_LR, fill = known_relationship)) +
geom_boxplot(outlier.size = 0.3, outlier.alpha = 0.2) +
facet_grid(population ~ known_relationship, scales = "free_y",
labeller = labeller(population = population_labels,
known_relationship = relationship_labels)) +
scale_fill_manual(values = relationship_colors, guide = "none") +
scale_x_discrete(labels = loci_set_labels) +
geom_hline(yintercept = 0, linetype = "dashed", color = "red", alpha = 0.5) +
labs(
title = "log10(Combined LR) - Testing Full-Siblings Hypothesis",
subtitle = "Rows = Population, Columns = True Relationship. Red line = LR=1",
x = "Loci Set",
y = "log10(Combined LR)"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 7),
strip.text = element_text(size = 8))# Calculate mean log10(LR) for each combination at 29 loci
lr_means <- all_combined[loci_set == "autosomal_29", .(
mean_log10_LR = mean(log10_LR),
sd_log10_LR = sd(log10_LR),
n = .N
), by = .(population, known_relationship, tested_relationship)]
# Plot: facet by tested_relationship and population
ggplot(lr_means, aes(x = known_relationship, y = mean_log10_LR, fill = known_relationship)) +
geom_col() +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
facet_grid(tested_relationship ~ population,
labeller = labeller(population = population_labels,
tested_relationship = relationship_labels)) +
scale_fill_manual(values = relationship_colors, guide = "none") +
scale_x_discrete(labels = function(x) str_wrap(relationship_labels[x], width = 8)) +
labs(
title = "Mean log10(Combined LR) at 29 Autosomal Loci",
subtitle = "Rows = Tested Hypothesis, Columns = Population. Red line = LR=1.",
x = "True (Known) Relationship",
y = "Mean log10(Combined LR)"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8),
strip.text = element_text(size = 9))How to interpret:
fixed_thresholds <- params$fixed_thresholds
# Calculate proportions for all combinations
proportions_all <- all_combined[, {
props <- sapply(fixed_thresholds, function(t) mean(combined_LR > t, na.rm = TRUE))
names(props) <- paste0("prop_LR_gt_", fixed_thresholds)
as.list(c(n = .N, props))
}, by = .(population, known_relationship, tested_relationship, loci_set)]
proportions_all[, n_loci := loci_counts[as.character(loci_set)]]
# Add classification column
proportions_all[, classification := fcase(
as.character(known_relationship) == as.character(tested_relationship), "True Positive",
known_relationship == "unrelated", "Unrelated FP",
default = "Related FP"
)]
fwrite(proportions_all, file.path(params$output_dir, "proportions_all_combinations.csv"))plot_proportions <- function(data, tested_hyp, threshold_col = "prop_LR_gt_1") {
subset_data <- data[tested_relationship == tested_hyp]
ggplot(subset_data, aes(x = loci_set, y = .data[[threshold_col]],
color = known_relationship, group = known_relationship)) +
geom_line(linewidth = 1) +
geom_point(size = 2) +
facet_wrap(~population, ncol = 5, labeller = labeller(population = population_labels)) +
scale_color_manual(values = relationship_colors, labels = relationship_labels) +
scale_x_discrete(labels = loci_set_labels) +
scale_y_continuous(labels = percent_format(), limits = c(0, 1)) +
labs(
title = paste("Proportion with LR > 1 - Testing", relationship_labels[tested_hyp], "Hypothesis"),
subtitle = "By population and true relationship",
x = "Loci Set",
y = "Proportion Exceeding LR > 1",
color = "True Relationship"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8),
legend.position = "bottom")
}
plot_proportions(proportions_all, "parent_child")Key observation: Note whether the half-siblings line is INCREASING or DECREASING with more loci. An increasing trend means more loci → more false identifications!
compare_data <- proportions_all[loci_set == "autosomal_29"]
ggplot(compare_data, aes(x = known_relationship, y = prop_LR_gt_1,
fill = tested_relationship)) +
geom_col(position = "dodge") +
facet_wrap(~population, ncol = 5, labeller = labeller(population = population_labels)) +
scale_fill_manual(values = c("parent_child" = "#1b9e77", "full_siblings" = "#d95f02"),
labels = relationship_labels) +
scale_x_discrete(labels = function(x) str_wrap(relationship_labels[x], width = 10)) +
scale_y_continuous(labels = percent_format()) +
labs(
title = "Comparison: Parent-Child vs Full-Siblings Hypothesis (29 Loci)",
subtitle = "Which hypothesis better discriminates each true relationship?",
x = "True Relationship",
y = "Proportion with LR > 1",
fill = "Tested Hypothesis"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "bottom")What this tests: Does the proportion exceeding LR > 1 differ significantly across loci sets?
Interpretation:
test_loci_effect <- function(data, pop, known_rel, tested_rel, threshold_col = "prop_LR_gt_1") {
subset <- data[population == pop & known_relationship == known_rel & tested_relationship == tested_rel]
if (nrow(subset) < 2) return(NULL)
n_exceed <- round(subset[[threshold_col]] * subset$n)
n_not_exceed <- subset$n - n_exceed
if (any(n_exceed < 0) || any(n_not_exceed < 0)) return(NULL)
if (sum(n_exceed) == 0 || sum(n_not_exceed) == 0) return(NULL)
contingency <- matrix(c(n_exceed, n_not_exceed), nrow = 2, byrow = TRUE,
dimnames = list(c("Exceed", "Not_Exceed"), as.character(subset$loci_set)))
test_result <- tryCatch(
chisq.test(contingency, simulate.p.value = TRUE, B = 2000),
error = function(e) NULL
)
if (is.null(test_result)) return(NULL)
data.frame(
population = pop,
known_relationship = known_rel,
tested_relationship = tested_rel,
statistic = test_result$statistic,
p_value = test_result$p.value,
significant = test_result$p.value < 0.05
)
}
# Run for all combinations
loci_tests <- rbindlist(lapply(population_order, function(pop) {
rbindlist(lapply(relationship_order, function(known_rel) {
rbindlist(lapply(tested_hypotheses, function(tested_rel) {
test_loci_effect(proportions_all, pop, known_rel, tested_rel)
}))
}))
}))
# Display for Full-Siblings hypothesis
kable(loci_tests[tested_relationship == "full_siblings",
.(population, known_relationship, statistic, p_value, significant)],
caption = "Chi-square: Loci effect on LR > 1 (Full-Siblings Hypothesis)",
digits = c(0, 0, 2, 4, 0)) %>%
kable_styling(bootstrap_options = c("striped", "hover"))| population | known_relationship | statistic | p_value | significant |
|---|---|---|---|---|
| all | half_siblings | 32.48 | 5e-04 | TRUE |
| AfAm | half_siblings | 31.54 | 5e-04 | TRUE |
| Asian | half_siblings | 39.76 | 5e-04 | TRUE |
| Cauc | half_siblings | 25.75 | 1e-03 | TRUE |
| Hispanic | half_siblings | 24.63 | 5e-04 | TRUE |
# Display for Parent-Child hypothesis
kable(loci_tests[tested_relationship == "parent_child",
.(population, known_relationship, statistic, p_value, significant)],
caption = "Chi-square: Loci effect on LR > 1 (Parent-Child Hypothesis)",
digits = c(0, 0, 2, 4, 0)) %>%
kable_styling(bootstrap_options = c("striped", "hover"))| population | known_relationship | statistic | p_value | significant |
|---|---|---|---|---|
| all | full_siblings | 33.82 | 0.0005 | TRUE |
| all | half_siblings | 24.72 | 0.0005 | TRUE |
| AfAm | full_siblings | 42.46 | 0.0005 | TRUE |
| AfAm | half_siblings | 31.21 | 0.0005 | TRUE |
| Asian | full_siblings | 12.82 | 0.0115 | TRUE |
| Asian | half_siblings | 24.37 | 0.0005 | TRUE |
| Cauc | full_siblings | 26.36 | 0.0010 | TRUE |
| Cauc | half_siblings | 20.72 | 0.0010 | TRUE |
| Hispanic | full_siblings | 38.76 | 0.0005 | TRUE |
| Hispanic | half_siblings | 31.54 | 0.0005 | TRUE |
What this tests: Does the proportion exceeding LR > 1 differ across populations?
Interpretation:
test_pop_effect <- function(data, known_rel, tested_rel, threshold_col = "prop_LR_gt_1") {
subset <- data[known_relationship == known_rel &
tested_relationship == tested_rel &
loci_set == "autosomal_29"]
if (nrow(subset) < 2) return(NULL)
n_exceed <- round(subset[[threshold_col]] * subset$n)
n_not_exceed <- subset$n - n_exceed
if (any(n_exceed < 0) || any(n_not_exceed < 0)) return(NULL)
if (sum(n_exceed) == 0 || sum(n_not_exceed) == 0) return(NULL)
contingency <- matrix(c(n_exceed, n_not_exceed), nrow = 2, byrow = TRUE,
dimnames = list(c("Exceed", "Not_Exceed"), as.character(subset$population)))
test_result <- tryCatch(
chisq.test(contingency, simulate.p.value = TRUE, B = 2000),
error = function(e) NULL
)
if (is.null(test_result)) return(NULL)
data.frame(
known_relationship = known_rel,
tested_relationship = tested_rel,
statistic = test_result$statistic,
p_value = test_result$p.value,
significant = test_result$p.value < 0.05
)
}
pop_tests <- rbindlist(lapply(relationship_order, function(known_rel) {
rbindlist(lapply(tested_hypotheses, function(tested_rel) {
test_pop_effect(proportions_all, known_rel, tested_rel)
}))
}))
kable(pop_tests,
caption = "Chi-square: Population effect on LR > 1 (29 Loci)",
digits = c(0, 0, 2, 4, 0)) %>%
kable_styling(bootstrap_options = c("striped", "hover"))| known_relationship | tested_relationship | statistic | p_value | significant |
|---|---|---|---|---|
| full_siblings | parent_child | 4.91 | 0.3478 | FALSE |
| half_siblings | parent_child | 39.33 | 0.0005 | TRUE |
| half_siblings | full_siblings | 29.29 | 0.0005 | TRUE |
What this tests: Is there a significant linear trend in proportion as loci increase?
Interpretation:
| Metric | Meaning |
|---|---|
| slope > 0 | Proportion INCREASES with more loci |
| slope < 0 | Proportion DECREASES with more loci |
| p < 0.05 | Trend is statistically significant |
| R² > 0.8 | Very consistent linear relationship |
trend_tests <- proportions_all[, {
if (.N < 3) return(NULL)
model <- tryCatch(lm(prop_LR_gt_1 ~ n_loci), error = function(e) NULL)
if (is.null(model)) return(NULL)
summary_model <- summary(model)
coef_table <- coef(summary_model)
if (nrow(coef_table) < 2) return(NULL)
list(
slope = coef_table[2, 1],
slope_se = coef_table[2, 2],
p_value = coef_table[2, 4],
r_squared = summary_model$r.squared,
trend = ifelse(coef_table[2, 1] > 0, "INCREASING", "DECREASING"),
significant = coef_table[2, 4] < 0.05
)
}, by = .(population, known_relationship, tested_relationship)]
# Display for full_siblings hypothesis
kable(trend_tests[tested_relationship == "full_siblings",
.(population, known_relationship, slope, p_value, r_squared, trend, significant)],
caption = "Linear trend: Proportion vs N loci (Full-Siblings Hypothesis)",
digits = c(0, 0, 5, 4, 3, 0, 0)) %>%
kable_styling(bootstrap_options = c("striped", "hover"))| population | known_relationship | slope | p_value | r_squared | trend | significant |
|---|---|---|---|---|---|---|
| all | parent_child | 0.00000 | 0.2735 | 0.488 | INCREASING | FALSE |
| all | full_siblings | 0.00000 | 0.2735 | 0.488 | INCREASING | FALSE |
| all | half_siblings | 0.00535 | 0.0143 | 0.898 | INCREASING | TRUE |
| all | cousins | 0.00000 | NaN | NaN | DECREASING | NA |
| all | second_cousins | 0.00000 | NaN | NaN | DECREASING | NA |
| all | unrelated | 0.00000 | NaN | NaN | DECREASING | NA |
| AfAm | parent_child | 0.00000 | 0.2735 | 0.488 | INCREASING | FALSE |
| AfAm | full_siblings | 0.00000 | 0.2735 | 0.488 | INCREASING | FALSE |
| AfAm | half_siblings | 0.00380 | 0.0294 | 0.837 | INCREASING | TRUE |
| AfAm | cousins | 0.00000 | NaN | NaN | DECREASING | NA |
| AfAm | second_cousins | 0.00000 | NaN | NaN | DECREASING | NA |
| AfAm | unrelated | 0.00000 | NaN | NaN | DECREASING | NA |
| Asian | parent_child | 0.00000 | 0.2735 | 0.488 | INCREASING | FALSE |
| Asian | full_siblings | 0.00000 | 0.2735 | 0.488 | INCREASING | FALSE |
| Asian | half_siblings | 0.00739 | 0.0098 | 0.920 | INCREASING | TRUE |
| Asian | cousins | 0.00000 | NaN | NaN | DECREASING | NA |
| Asian | second_cousins | 0.00000 | NaN | NaN | DECREASING | NA |
| Asian | unrelated | 0.00000 | NaN | NaN | DECREASING | NA |
| Cauc | parent_child | 0.00000 | 0.2735 | 0.488 | INCREASING | FALSE |
| Cauc | full_siblings | 0.00000 | 0.2735 | 0.488 | INCREASING | FALSE |
| Cauc | half_siblings | 0.00393 | 0.0180 | 0.882 | INCREASING | TRUE |
| Cauc | cousins | 0.00000 | NaN | NaN | DECREASING | NA |
| Cauc | second_cousins | 0.00000 | NaN | NaN | DECREASING | NA |
| Cauc | unrelated | 0.00000 | NaN | NaN | DECREASING | NA |
| Hispanic | parent_child | 0.00000 | 0.2735 | 0.488 | INCREASING | FALSE |
| Hispanic | full_siblings | 0.00000 | 0.2735 | 0.488 | INCREASING | FALSE |
| Hispanic | half_siblings | 0.00539 | 0.0054 | 0.946 | INCREASING | TRUE |
| Hispanic | cousins | 0.00000 | NaN | NaN | DECREASING | NA |
| Hispanic | second_cousins | 0.00000 | NaN | NaN | DECREASING | NA |
| Hispanic | unrelated | 0.00000 | NaN | NaN | DECREASING | NA |
# Display for parent_child hypothesis
kable(trend_tests[tested_relationship == "parent_child",
.(population, known_relationship, slope, p_value, r_squared, trend, significant)],
caption = "Linear trend: Proportion vs N loci (Parent-Child Hypothesis)",
digits = c(0, 0, 5, 4, 3, 0, 0)) %>%
kable_styling(bootstrap_options = c("striped", "hover"))| population | known_relationship | slope | p_value | r_squared | trend | significant |
|---|---|---|---|---|---|---|
| all | parent_child | 0.00000 | 0.2735 | 0.488 | INCREASING | FALSE |
| all | full_siblings | 0.00223 | 0.0582 | 0.749 | INCREASING | FALSE |
| all | half_siblings | -0.00517 | 0.0580 | 0.749 | DECREASING | FALSE |
| all | cousins | 0.00000 | NaN | NaN | DECREASING | NA |
| all | second_cousins | 0.00000 | NaN | NaN | DECREASING | NA |
| all | unrelated | 0.00000 | NaN | NaN | DECREASING | NA |
| AfAm | parent_child | 0.00000 | 0.2735 | 0.488 | INCREASING | FALSE |
| AfAm | full_siblings | 0.00167 | 0.1149 | 0.618 | INCREASING | FALSE |
| AfAm | half_siblings | -0.00788 | 0.0048 | 0.950 | DECREASING | TRUE |
| AfAm | cousins | 0.00000 | NaN | NaN | DECREASING | NA |
| AfAm | second_cousins | 0.00000 | NaN | NaN | DECREASING | NA |
| AfAm | unrelated | 0.00000 | NaN | NaN | DECREASING | NA |
| Asian | parent_child | 0.00000 | 0.2735 | 0.488 | INCREASING | FALSE |
| Asian | full_siblings | 0.00195 | 0.0052 | 0.947 | INCREASING | TRUE |
| Asian | half_siblings | -0.00393 | 0.0844 | 0.683 | DECREASING | FALSE |
| Asian | cousins | 0.00000 | NaN | NaN | DECREASING | NA |
| Asian | second_cousins | 0.00000 | NaN | NaN | DECREASING | NA |
| Asian | unrelated | 0.00000 | NaN | NaN | DECREASING | NA |
| Cauc | parent_child | 0.00000 | 0.2735 | 0.488 | INCREASING | FALSE |
| Cauc | full_siblings | 0.00185 | 0.0415 | 0.797 | INCREASING | TRUE |
| Cauc | half_siblings | -0.00568 | 0.0092 | 0.923 | DECREASING | TRUE |
| Cauc | cousins | 0.00000 | NaN | NaN | DECREASING | NA |
| Cauc | second_cousins | 0.00000 | NaN | NaN | DECREASING | NA |
| Cauc | unrelated | 0.00000 | NaN | NaN | DECREASING | NA |
| Hispanic | parent_child | 0.00000 | 0.2735 | 0.488 | INCREASING | FALSE |
| Hispanic | full_siblings | 0.00304 | 0.0306 | 0.833 | INCREASING | TRUE |
| Hispanic | half_siblings | -0.00596 | 0.0057 | 0.944 | DECREASING | TRUE |
| Hispanic | cousins | 0.00000 | NaN | NaN | DECREASING | NA |
| Hispanic | second_cousins | 0.00000 | NaN | NaN | DECREASING | NA |
| Hispanic | unrelated | 0.00000 | NaN | NaN | DECREASING | NA |
Critical finding: If half_siblings under Full-Siblings hypothesis shows: - trend = “INCREASING” - significant = TRUE
This confirms that more loci increases the rate at which half-siblings are falsely identified as full siblings.
summary_29 <- proportions_all[loci_set == "autosomal_29"]
ggplot(summary_29, aes(x = known_relationship, y = prop_LR_gt_1, fill = classification)) +
geom_col(position = "dodge") +
facet_grid(tested_relationship ~ population,
labeller = labeller(population = population_labels,
tested_relationship = relationship_labels)) +
scale_fill_manual(values = c("True Positive" = "#2ca02c",
"Related FP" = "#ff7f0e",
"Unrelated FP" = "#d62728")) +
scale_x_discrete(labels = function(x) str_wrap(relationship_labels[x], width = 8)) +
scale_y_continuous(labels = percent_format()) +
labs(
title = "Classification Performance at 29 Autosomal Loci",
subtitle = "Rows = Tested Hypothesis, Columns = Population",
x = "True Relationship",
y = "Proportion with LR > 1",
fill = "Classification"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8),
legend.position = "bottom")rates_table <- proportions_all[
loci_set == "autosomal_29",
.(population, known_relationship, tested_relationship, classification,
prop_LR_gt_1, prop_LR_gt_10, prop_LR_gt_100, prop_LR_gt_1000)
]
setorder(rates_table, tested_relationship, population, classification, known_relationship)
kable(rates_table,
caption = "Detailed rates at 29 Autosomal Loci",
digits = 4) %>%
kable_styling(bootstrap_options = c("striped", "hover"), font_size = 10) %>%
scroll_box(height = "500px")| population | known_relationship | tested_relationship | classification | prop_LR_gt_1 | prop_LR_gt_10 | prop_LR_gt_100 | prop_LR_gt_1000 |
|---|---|---|---|---|---|---|---|
| all | full_siblings | parent_child | Related FP | 0.996 | 0.958 | 0.796 | 0.436 |
| all | half_siblings | parent_child | Related FP | 0.104 | 0.020 | 0.000 | 0.000 |
| all | cousins | parent_child | Related FP | 0.000 | 0.000 | 0.000 | 0.000 |
| all | second_cousins | parent_child | Related FP | 0.000 | 0.000 | 0.000 | 0.000 |
| all | parent_child | parent_child | True Positive | 1.000 | 1.000 | 1.000 | 1.000 |
| all | unrelated | parent_child | Unrelated FP | 0.000 | 0.000 | 0.000 | 0.000 |
| AfAm | full_siblings | parent_child | Related FP | 1.000 | 0.990 | 0.876 | 0.632 |
| AfAm | half_siblings | parent_child | Related FP | 0.160 | 0.032 | 0.004 | 0.000 |
| AfAm | cousins | parent_child | Related FP | 0.000 | 0.000 | 0.000 | 0.000 |
| AfAm | second_cousins | parent_child | Related FP | 0.000 | 0.000 | 0.000 | 0.000 |
| AfAm | parent_child | parent_child | True Positive | 1.000 | 1.000 | 1.000 | 1.000 |
| AfAm | unrelated | parent_child | Unrelated FP | 0.000 | 0.000 | 0.000 | 0.000 |
| Asian | full_siblings | parent_child | Related FP | 0.992 | 0.910 | 0.680 | 0.320 |
| Asian | half_siblings | parent_child | Related FP | 0.062 | 0.002 | 0.000 | 0.000 |
| Asian | cousins | parent_child | Related FP | 0.000 | 0.000 | 0.000 | 0.000 |
| Asian | second_cousins | parent_child | Related FP | 0.000 | 0.000 | 0.000 | 0.000 |
| Asian | parent_child | parent_child | True Positive | 1.000 | 1.000 | 1.000 | 1.000 |
| Asian | unrelated | parent_child | Unrelated FP | 0.000 | 0.000 | 0.000 | 0.000 |
| Cauc | full_siblings | parent_child | Related FP | 0.998 | 0.972 | 0.854 | 0.570 |
| Cauc | half_siblings | parent_child | Related FP | 0.116 | 0.016 | 0.000 | 0.000 |
| Cauc | cousins | parent_child | Related FP | 0.000 | 0.000 | 0.000 | 0.000 |
| Cauc | second_cousins | parent_child | Related FP | 0.000 | 0.000 | 0.000 | 0.000 |
| Cauc | parent_child | parent_child | True Positive | 1.000 | 1.000 | 1.000 | 1.000 |
| Cauc | unrelated | parent_child | Unrelated FP | 0.000 | 0.000 | 0.000 | 0.000 |
| Hispanic | full_siblings | parent_child | Related FP | 0.996 | 0.948 | 0.750 | 0.402 |
| Hispanic | half_siblings | parent_child | Related FP | 0.058 | 0.004 | 0.002 | 0.000 |
| Hispanic | cousins | parent_child | Related FP | 0.000 | 0.000 | 0.000 | 0.000 |
| Hispanic | second_cousins | parent_child | Related FP | 0.000 | 0.000 | 0.000 | 0.000 |
| Hispanic | parent_child | parent_child | True Positive | 1.000 | 1.000 | 1.000 | 1.000 |
| Hispanic | unrelated | parent_child | Unrelated FP | 0.000 | 0.000 | 0.000 | 0.000 |
| all | parent_child | full_siblings | Related FP | 1.000 | 1.000 | 1.000 | 1.000 |
| all | half_siblings | full_siblings | Related FP | 0.968 | 0.802 | 0.428 | 0.142 |
| all | cousins | full_siblings | Related FP | 0.000 | 0.000 | 0.000 | 0.000 |
| all | second_cousins | full_siblings | Related FP | 0.000 | 0.000 | 0.000 | 0.000 |
| all | full_siblings | full_siblings | True Positive | 1.000 | 1.000 | 1.000 | 1.000 |
| all | unrelated | full_siblings | Unrelated FP | 0.000 | 0.000 | 0.000 | 0.000 |
| AfAm | parent_child | full_siblings | Related FP | 1.000 | 1.000 | 1.000 | 1.000 |
| AfAm | half_siblings | full_siblings | Related FP | 0.978 | 0.872 | 0.588 | 0.268 |
| AfAm | cousins | full_siblings | Related FP | 0.000 | 0.000 | 0.000 | 0.000 |
| AfAm | second_cousins | full_siblings | Related FP | 0.000 | 0.000 | 0.000 | 0.000 |
| AfAm | full_siblings | full_siblings | True Positive | 1.000 | 1.000 | 1.000 | 1.000 |
| AfAm | unrelated | full_siblings | Unrelated FP | 0.000 | 0.000 | 0.000 | 0.000 |
| Asian | parent_child | full_siblings | Related FP | 1.000 | 1.000 | 1.000 | 1.000 |
| Asian | half_siblings | full_siblings | Related FP | 0.916 | 0.690 | 0.352 | 0.086 |
| Asian | cousins | full_siblings | Related FP | 0.000 | 0.000 | 0.000 | 0.000 |
| Asian | second_cousins | full_siblings | Related FP | 0.000 | 0.000 | 0.000 | 0.000 |
| Asian | full_siblings | full_siblings | True Positive | 1.000 | 1.000 | 1.000 | 1.000 |
| Asian | unrelated | full_siblings | Unrelated FP | 0.000 | 0.000 | 0.000 | 0.000 |
| Cauc | parent_child | full_siblings | Related FP | 1.000 | 1.000 | 1.000 | 1.000 |
| Cauc | half_siblings | full_siblings | Related FP | 0.966 | 0.824 | 0.504 | 0.186 |
| Cauc | cousins | full_siblings | Related FP | 0.000 | 0.000 | 0.000 | 0.000 |
| Cauc | second_cousins | full_siblings | Related FP | 0.000 | 0.000 | 0.000 | 0.000 |
| Cauc | full_siblings | full_siblings | True Positive | 1.000 | 1.000 | 1.000 | 1.000 |
| Cauc | unrelated | full_siblings | Unrelated FP | 0.000 | 0.000 | 0.000 | 0.000 |
| Hispanic | parent_child | full_siblings | Related FP | 1.000 | 1.000 | 1.000 | 1.000 |
| Hispanic | half_siblings | full_siblings | Related FP | 0.938 | 0.706 | 0.360 | 0.096 |
| Hispanic | cousins | full_siblings | Related FP | 0.000 | 0.000 | 0.000 | 0.000 |
| Hispanic | second_cousins | full_siblings | Related FP | 0.000 | 0.000 | 0.000 | 0.000 |
| Hispanic | full_siblings | full_siblings | True Positive | 1.000 | 1.000 | 1.000 | 1.000 |
| Hispanic | unrelated | full_siblings | Unrelated FP | 0.000 | 0.000 | 0.000 | 0.000 |
## === KEY FINDINGS ===
## 1. TRUE POSITIVE RATES (at 29 loci, LR > 1):
tp_rates <- proportions_all[loci_set == "autosomal_29" &
as.character(known_relationship) == as.character(tested_relationship),
.(population, tested_relationship, prop_LR_gt_1)]
print(dcast(tp_rates, tested_relationship ~ population, value.var = "prop_LR_gt_1"))## Key: <tested_relationship>
## tested_relationship all AfAm Asian Cauc Hispanic
## <fctr> <num> <num> <num> <num> <num>
## 1: parent_child 1 1 1 1 1
## 2: full_siblings 1 1 1 1 1
##
## 2. UNRELATED FALSE POSITIVE RATES (at 29 loci, LR > 1):
unrel_fp <- proportions_all[loci_set == "autosomal_29" &
known_relationship == "unrelated",
.(population, tested_relationship, prop_LR_gt_1)]
print(dcast(unrel_fp, tested_relationship ~ population, value.var = "prop_LR_gt_1"))## Key: <tested_relationship>
## tested_relationship all AfAm Asian Cauc Hispanic
## <fctr> <num> <num> <num> <num> <num>
## 1: parent_child 0 0 0 0 0
## 2: full_siblings 0 0 0 0 0
##
## 3. HALF-SIBLING AS FULL-SIBLING FALSE ID (at 29 loci):
hs_fp <- proportions_all[loci_set == "autosomal_29" &
known_relationship == "half_siblings" &
tested_relationship == "full_siblings",
.(population, prop_LR_gt_1, prop_LR_gt_10, prop_LR_gt_100)]
print(hs_fp)## population prop_LR_gt_1 prop_LR_gt_10 prop_LR_gt_100
## <fctr> <num> <num> <num>
## 1: all 0.968 0.802 0.428
## 2: AfAm 0.978 0.872 0.588
## 3: Asian 0.916 0.690 0.352
## 4: Cauc 0.966 0.824 0.504
## 5: Hispanic 0.938 0.706 0.360
##
## 4. TREND DIRECTIONS (Full-Siblings hypothesis, half-siblings):
hs_trend <- trend_tests[tested_relationship == "full_siblings" & known_relationship == "half_siblings"]
print(hs_trend[, .(population, trend, slope, p_value, significant)])## population trend slope p_value significant
## <fctr> <char> <num> <num> <lgcl>
## 1: all INCREASING 0.005353659 0.014302191 TRUE
## 2: AfAm INCREASING 0.003804878 0.029444892 TRUE
## 3: Asian INCREASING 0.007390244 0.009790329 TRUE
## 4: Cauc INCREASING 0.003926829 0.017968074 TRUE
## 5: Hispanic INCREASING 0.005390244 0.005364040 TRUE
| What to Check | Good Sign | Concerning Sign |
|---|---|---|
| True Positive Rate | High (>90%) and increasing with loci | Low or decreasing |
| Unrelated FP Rate | Low (<5%) and decreasing with loci | High or increasing |
| Related FP Rate (e.g., half-sibs) | Low and decreasing with loci | High and INCREASING |
| Population effect | Not significant | Highly significant |
## Analysis completed: 2025-12-12 14:09:38
## Output directory: ../output/analysis_results
## Generated files:
## - classification_summary.png
## - combined_LR_summary_by_loci.csv
## - detailed_rates_29loci.csv
## - false_identification_rates.png
## - fixed_threshold_trends.png
## - mean_log10_LR_summary.csv
## - proportions_all_combinations.csv
## - proportions_fixed_thresholds.csv
## - stat_test_loci_effect_stratified.csv
## - stat_test_loci_effect.csv
## - stat_test_population_effect.csv
## - stat_test_trend_analysis.csv
## - stat_test_trend_stratified.csv
## - stat_test_trend.csv
## - summary_rates.csv
## - tpr_vs_fpr.png
## R version 4.5.2 (2025-10-31)
## Platform: aarch64-apple-darwin20
## Running under: macOS Tahoe 26.1
##
## Matrix products: default
## BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.1
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/Detroit
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] kableExtra_1.4.0 knitr_1.50 scales_1.4.0 data.table_1.17.4
## [5] lubridate_1.9.4 forcats_1.0.0 stringr_1.5.1 dplyr_1.1.4
## [9] purrr_1.0.4 readr_2.1.5 tidyr_1.3.1 tibble_3.2.1
## [13] ggplot2_4.0.0 tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] sass_0.4.10 generics_0.1.4 xml2_1.3.8 stringi_1.8.7
## [5] hms_1.1.3 digest_0.6.37 magrittr_2.0.3 evaluate_1.0.3
## [9] grid_4.5.2 timechange_0.3.0 RColorBrewer_1.1-3 fastmap_1.2.0
## [13] jsonlite_2.0.0 viridisLite_0.4.2 textshaping_1.0.1 jquerylib_0.1.4
## [17] cli_3.6.5 rlang_1.1.6 withr_3.0.2 cachem_1.1.0
## [21] yaml_2.3.10 tools_4.5.2 tzdb_0.5.0 vctrs_0.6.5
## [25] R6_2.6.1 lifecycle_1.0.4 ragg_1.4.0 pkgconfig_2.0.3
## [29] pillar_1.10.2 bslib_0.9.0 gtable_0.3.6 glue_1.8.0
## [33] systemfonts_1.3.1 xfun_0.54 tidyselect_1.2.1 rstudioapi_0.17.1
## [37] dichromat_2.0-0.1 farver_2.1.2 htmltools_0.5.8.1 rmarkdown_2.29
## [41] svglite_2.2.1 labeling_0.4.3 compiler_4.5.2 S7_0.2.0